Healthcare Index
Data Cleaning and Merge Datasets
We created a new dataset that contains the information for the Healthcare Index, Oil Production and Oil price. Index and Production variables have been transformed in order to have a closer scale to the Price variable.
{r}
# --- Build a monthly healthcare index series from daily company prices ---
# Load the raw healthcare-companies price file.
df_healthcare_companies <- read.csv('../data/viz_healthcare_companies.csv')
# Keep only the date and the TMO ticker column (used as the index proxy).
df_healthcare_companies <- df_healthcare_companies %>% dplyr::select('date', 'TMO')
# Rename to the generic names used downstream.
names(df_healthcare_companies) <- c('date', 'adjusted')
# Parse the date column into Date objects.
df_healthcare_companies$date <- as.Date(df_healthcare_companies$date)
# First and last observed trading dates.
start_date <- as.Date(min(df_healthcare_companies$date))
end_date <- as.Date(max(df_healthcare_companies$date))
# Full calendar-day sequence covering the observed range.
date_range <- seq(start_date, end_date, by = "1 day")
# Wrap the sequence in a data frame so it can be merged.
date_dataset <- data.frame(Date = date_range)
# Outer-join onto the calendar so non-trading days appear as NA rows.
df_healthcare_companies <- merge(df_healthcare_companies, date_dataset, by.x = "date", by.y = "Date", all = TRUE)
# Check for missing values (kept for reference)
# is.na(df_oil_price$adjusted)
# Rows containing at least one missing value (inspection only).
df_na_rows <- df_healthcare_companies[which(rowSums(is.na(df_healthcare_companies)) > 0),]
# Columns containing at least one missing value (inspection only).
df_na_cols <- df_healthcare_companies[, which(colSums(is.na(df_healthcare_companies)) > 0)]
# Fill the NA gaps with an exponentially weighted moving average
# (imputeTS::na_ma, window k = 4). NOTE(review): the whole data frame is
# passed rather than just the 'adjusted' column — confirm na_ma handles
# the date column as intended.
imputed_time_series <- na_ma(df_healthcare_companies, k = 4, weighting = "exponential")
# Materialize the imputed series as a plain data frame.
df_healthcare_companies <- data.frame(imputed_time_series)
# Re-parse dates; format "%m/%d/%y" assumes the imputation step returned
# reformatted date strings — TODO confirm against the actual output.
df_healthcare_companies$date <- as.Date(df_healthcare_companies$date,format = "%m/%d/%y")
# Split the date into separate Year / Month / Day columns.
df_healthcare_companies_monthly <- df_healthcare_companies %>%
mutate(Year = lubridate::year(date),
Month = lubridate::month(date),
Day = lubridate::day(date))
# Group by Year/Month and keep the last observed day of each month.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>%
group_by(Year, Month) %>%
summarize(Max_Day = max(Day))
# Rebuild a month-end Date from the Year/Month/Max_Day triple.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>%
mutate(date = make_date(Year, Month, Max_Day))
# Attach the month-end adjusted value from the daily series.
df_healthcare_companies_monthly <- merge(df_healthcare_companies_monthly, df_healthcare_companies, by = "date", all.x = TRUE)
# Keep only the date and value columns.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% dplyr::select("date", "adjusted")
# Rename to the final column names.
names(df_healthcare_companies_monthly) <- c('Date', 'Index')
# Save the monthly series to disk.
write.csv(df_healthcare_companies_monthly, '../data/df_healthcare_companies_monthly.csv', row.names = FALSE)
# ---------------------------------------------------------------------------------------------------------------------------------
# --- Align the healthcare index, oil production and oil price series ---
# Re-import the monthly healthcare series written above.
df_healthcare_companies_monthly <- as.data.frame(read_csv('../data/df_healthcare_companies_monthly.csv'))
# Restrict to the 2000-2022 study window.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% filter(year(Date) >= 2000 & year(Date) <= 2022)
# Normalize every month to its first day so the three series share a key.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>%
mutate(date2 = make_date(year(Date), month(Date), 01))
# Import the US oil-production dataset.
df_oil_production <- as.data.frame(read_csv('../data/viz_us_oil_production.csv'))
# Restrict to the 2000-2022 study window.
df_oil_production <- df_oil_production %>% filter(year(Date) >= 2000 & year(Date) <= 2022)
# Normalize to first-of-month dates.
df_oil_production <- df_oil_production %>%
mutate(date2 = make_date(year(Date), month(Date), 01))
# Import the monthly oil-price dataset.
df_oil_price <- as.data.frame(read_csv('../data/df_oil_price_monthly.csv'))
# Normalize to first-of-month dates (this file uses lowercase 'date').
df_oil_price <- df_oil_price %>%
mutate(date2 = make_date(year(date), month(date), 01))
# Ensure the join key is a Date object in all three frames.
df_healthcare_companies_monthly$date2 <- as.Date(df_healthcare_companies_monthly$date2)
df_oil_production$date2 <- as.Date(df_oil_production$date2)
df_oil_price$date2 <- as.Date(df_oil_price$date2)
# Latest of the three start dates = the common starting point.
dates <- c(min(df_healthcare_companies_monthly$date2), min(df_oil_production$date2),min(df_oil_price$date2))
min_date <- max(dates)
# Trim each series to the common starting date.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% filter(date2 >= min_date)
# Trim each series to the common starting date.
df_oil_production <- df_oil_production %>% filter(date2 >= min_date)
# Trim each series to the common starting date.
df_oil_price <- df_oil_price %>% filter(date2 >= min_date)
# Keep the key and value columns only.
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% dplyr::select('date2', 'Index')
# Log-transform to bring the scale closer to the price series.
df_healthcare_companies_monthly$Index <- log(df_healthcare_companies_monthly$Index)
# Keep the key and value columns only.
df_oil_production <- df_oil_production %>% dplyr::select('date2', 'Production')
# Log-transform to bring the scale closer to the price series.
df_oil_production$Production <- log(df_oil_production$Production)
# Keep the key and value columns only (price is left untransformed).
df_oil_price <- df_oil_price %>% dplyr::select('date2', 'adjusted')
# Collect the three frames for the combined merge.
df_list <- list(df_healthcare_companies_monthly, df_oil_production, df_oil_price)
# merge_dataframes() is a project helper (not defined here); presumably
# it joins the frames on date2 — verify against its definition.
dd <- merge_dataframes(df_list)
# Final column names for the combined table.
names(dd) <- c('DATE', 'Index', 'Production', 'Price')
# Sort chronologically.
dd <- dd %>% dplyr::arrange(DATE)
# # Create the time series object (kept for reference; note 'star=' is a
# # typo for 'start=' should this line ever be re-enabled)
# dd.ts <- ts(dd,star=decimal_date(min_date),frequency = 12)
# Show the first rows of the combined table.
knitr::kable(head(dd))| DATE | Index | Production | Price |
|---|---|---|---|
| 2000-08-01 | 2.942912 | 8.663715 | 1.0327613 |
| 2000-09-01 | 3.084701 | 8.658345 | 0.9740746 |
| 2000-10-01 | 3.169293 | 8.667164 | 1.0199688 |
| 2000-11-01 | 3.169293 | 8.671287 | 1.0546022 |
| 2000-12-01 | 3.178951 | 8.675051 | 0.8429909 |
| 2001-01-01 | 3.191460 | 8.665441 | 0.8954759 |
Plot
In the plot below we can observe the time series for the variables Index, Production and Price from 2000 to 2022.
{r}
# Overlay the three aligned series (log Index, log Production, Price) on
# a shared date axis; the object is rendered interactively via ggplotly.
plot <- ggplot(dd, aes(x = DATE)) +
  geom_line(aes(y = Index, color = "Index"), alpha = 0.7) +
  geom_line(aes(y = Production, color = "Production"), alpha = 0.7) +
  geom_line(aes(y = Price, color = "Price"), alpha = 0.7) +
  scale_color_manual(values = c(Index = "red", Production = "blue", Price = "green")) +
  labs(x = "Date",
       y = "Value",
       color = "Variables",  # legend title
       title = "Healthcare Index ~ Oil Price + Oil Production") +
  theme_minimal()
plot %>% ggplotly()Linear Models
In FIT 1 we can observe the first linear model where the Index is the response (dependent) variable and Oil Production and Oil Price are the predictor (independent) variables. The variable Oil Price has a p-value higher than the significance level of 0.05, so it is not significant.
In FIT 2 we can see the second linear model where the Index is the response variable and Oil Production is the single predictor. The model has good R-squared and RMSE values.
{r}
# FIT 1: regress the healthcare Index on all remaining columns of dd
# (DATE, Production, Price).
fit_1 <- lm(Index ~ ., data = dd)
summary(fit_1)
Call:
lm(formula = Index ~ ., data = dd)
Residuals:
Min 1Q Median 3Q Max
-0.49818 -0.13909 -0.06932 0.17576 0.51160
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.863e+00 8.388e-01 -9.375 < 2e-16 ***
DATE 3.373e-04 1.708e-05 19.752 < 2e-16 ***
Production 7.903e-01 1.180e-01 6.695 1.28e-10 ***
Price 2.301e-02 2.316e-02 0.994 0.321
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2202 on 265 degrees of freedom
Multiple R-squared: 0.9574, Adjusted R-squared: 0.9569
F-statistic: 1985 on 3 and 265 DF, p-value: < 2.2e-16
{r}
# FIT 2: regress the healthcare Index on oil Production alone.
fit_2 <- lm(Index ~ Production, data = dd)
summary(fit_2)
Call:
lm(formula = Index ~ Production, data = dd)
Residuals:
Min 1Q Median 3Q Max
-1.02182 -0.31398 -0.01122 0.35995 1.37957
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -22.06617 0.73678 -29.95 <2e-16 ***
Production 2.97392 0.08291 35.87 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4405 on 267 degrees of freedom
Multiple R-squared: 0.8281, Adjusted R-squared: 0.8275
F-statistic: 1287 on 1 and 267 DF, p-value: < 2.2e-16
R^2: 0.8281319
RMSE: 0.43888
Stationarity
The ACF plot shows high autocorrelation in the lags, so the time series is not stationary. Evaluating the results of the Augmented Dickey-Fuller test, the p-value (0.5728) is higher than the significance level of 0.05. Therefore, we cannot reject the null hypothesis that the time series is non-stationary (i.e., that it contains a unit root).
Time Series, ACF Plot and PACF Plot
{r}
# Residuals of the Production-only model; their stationarity is analyzed
# below (series / ACF / PACF via forecast::ggtsdisplay).
lm.residuals <- residuals(fit_2)
lm.residuals %>% ggtsdisplay()Augmented Dickey-Fuller Test
{r}
# Augmented Dickey-Fuller test on the regression residuals.
# H0: the series has a unit root (non-stationary); H1: stationary.
adf_test <- adf.test(lm.residuals)
print(adf_test)
Augmented Dickey-Fuller Test
data: lm.residuals
Dickey-Fuller = -2.0076, Lag order = 6, p-value = 0.5728
alternative hypothesis: stationary
Differencing
We apply the first difference to the residuals of the linear model to obtain a stationary time series. We can observe the volatility of the residuals and two main volatility clusters in the data points near 100 and near 250.
In the acf plot, we can observe that the differenced residuals look stationary.
{r}
lm.residuals %>% diff() %>% ggtsdisplay()Model Selection
Based on the ACF and PACF plot for the first difference on residuals, we define:
\(p = 2\)
\(d = 1\)
\(q = 2\)
Next we will run the model diagnostics on the models that have the minimum AIC, minimum BIC and the auto.arima() results.
Model Parameters
{r}
# Grid-search ARIMA(p, d, q) orders over p in 0..2, d in 0..1, q in 0..2
# and record AIC / BIC / AICc for each fit.
xt <- lm.residuals
p_value <- 2
d_value <- 1
q_value <- 2
i <- 1
# Preallocate one result row per (p, d, q) combination; 6 columns:
# p, d, q, AIC, BIC, AICc.
rows <- (p_value + 1) * (d_value + 1) * (q_value + 1)
ls <- matrix(rep(NA, 6 * rows), nrow = rows)
# The original loops iterated over shifted ranges (0:p_value + 1, i.e.
# 1:3) and subtracted 1 everywhere, relying on ':' binding tighter than
# '+'. Looping directly over the 0-based orders visits the same (p, d, q)
# triples in the same sequence without the off-by-one bookkeeping.
for (p in 0:p_value)
{
for (q in 0:q_value)
{
for (d in 0:d_value)
{
if (p + d + q <= 8) # usual threshold on the total order
{
model <- Arima(xt, order = c(p, d, q), include.drift = TRUE)
ls[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
i <- i + 1
#print(i)
}
}
}
}
# Collect the results, drop unused (all-NA) rows, and label the columns.
temp <- as.data.frame(ls)
temp <- na.omit(temp)
names(temp) <- c("p","d","q","AIC","BIC","AICc")
#temp
knitr::kable(temp)| p | d | q | AIC | BIC | AICc |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 279.53081 | 290.31494 | 279.62137 |
| 0 | 1 | 0 | -335.92247 | -328.74050 | -335.87719 |
| 0 | 0 | 1 | 21.29335 | 35.67219 | 21.44486 |
| 0 | 1 | 1 | -335.55719 | -324.78423 | -335.46628 |
| 0 | 0 | 2 | -114.66152 | -96.68796 | -114.43338 |
| 0 | 1 | 2 | -338.38902 | -324.02507 | -338.23693 |
| 1 | 0 | 0 | -338.26285 | -323.88401 | -338.11134 |
| 1 | 1 | 0 | -335.12381 | -324.35085 | -335.03290 |
| 1 | 0 | 1 | -336.93043 | -318.95688 | -336.70230 |
| 1 | 1 | 1 | -338.39144 | -324.02749 | -338.23935 |
| 1 | 0 | 2 | -338.27166 | -316.70339 | -337.95105 |
| 1 | 1 | 2 | -337.46060 | -319.50566 | -337.23159 |
| 2 | 0 | 0 | -336.77331 | -318.79975 | -336.54517 |
| 2 | 1 | 0 | -337.55096 | -323.18701 | -337.39886 |
| 2 | 0 | 1 | -337.53053 | -315.96226 | -337.20992 |
| 2 | 1 | 1 | -337.47950 | -319.52457 | -337.25049 |
| 2 | 0 | 2 | -336.77357 | -311.61059 | -336.34445 |
| 2 | 1 | 2 | -335.48229 | -313.93637 | -335.16045 |
Model Summary
{r}
temp[which.min(temp$AIC),] p d q AIC BIC AICc
10 1 1 1 -338.3914 -324.0275 -338.2393
{r}
temp[which.min(temp$BIC),] p d q AIC BIC AICc
2 0 1 0 -335.9225 -328.7405 -335.8772
{r}
temp[which.min(temp$AICc),] p d q AIC BIC AICc
10 1 1 1 -338.3914 -324.0275 -338.2393
auto.arima()
{r}
# Let auto.arima() select (p, d, q) automatically (no exogenous
# regressors are passed) for comparison with the manual grid search.
fit_auto_arima <- auto.arima(xt)
summary(fit_auto_arima)Series: xt
ARIMA(0,1,2)
Coefficients:
ma1 ma2
-0.0965 -0.1310
s.e. 0.0607 0.0599
sigma^2 = 0.01622: log likelihood = 172.96
AIC=-339.91 AICc=-339.82 BIC=-329.14
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.00548734 0.12666 0.08145436 77.30622 163.8885 0.991597
ACF1
Training set 0.007261496
Model Diagnostics
Comparing the results of the model diagnostics, we can observe that the ARIMA(0, 1, 2) model is the best. The lag values in the ACF plot are within the confidence bands, so the residuals show no significant autocorrelation. The standardized residuals plotted in the Normal Q-Q plot are close to normality. The p-values for the Ljung-Box statistic confirm that the residuals behave like white noise.
{r}
# Orders of the grid-search model with the lowest AIC. Note: this local
# `AIC` data frame shadows stats::AIC within the chunk.
AIC <- temp[which.min(temp$AIC),]
p1 <- AIC$p
d1 <- AIC$d
q1 <- AIC$q
model_output <- capture.output(sarima(xt, p1, d1, q1)){r}
# Orders of the grid-search model with the lowest BIC (shadows
# stats::BIC within the chunk).
BIC <- temp[which.min(temp$BIC),]
p2 <- BIC$p
d2 <- BIC$d
q2 <- BIC$q
model_output <- capture.output(sarima(xt, p2, d2, q2)){r}
# Orders chosen by auto.arima(): ARIMA(0, 1, 2).
p3 <- 0
d3 <- 1
q3 <- 2
model_output <- capture.output(sarima(xt, p3, d3, q3))Residuals
We plot the residuals and the residuals squared of the arima model using the ACF and PACF plot to determine the parameters for the GARCH model.
The values that will be used to assess the model are the following:
\(p = 1\)
\(q = 2\)
Residuals ACF
{r}
# Refit the selected ARIMA(0, 1, 2) with drift and extract its residuals
# for the GARCH modelling stage.
arima_model <- Arima(xt,order=c(p3, d3, q3),include.drift = TRUE)
arima.res <- residuals(arima_model)
acf(arima.res)Residuals^2 ACF
{r}
acf(arima.res^2)Residuals^2 PACF
{r}
pacf(arima.res^2)GARCH Model
After running the GARCH models possibilities we observe that the model with the lowest AIC is GARCH(2, 1). The summary of the model shows that the beta1 component is not significant, therefore we decide to remove that component and test the GARCH(2, 0) model. For the second model, all the components are significant and the resulting AIC is lower compared to the AIC value of the GARCH(2, 1) model.
{r}
# Fit every GARCH(q, p) candidate for p, q in {1, 2} on the ARIMA
# residuals and report the fit with the lowest AIC.
model <- list()
cc <- 1 # model counter
for (p in 1:2) {
for (q in 1:2) {
# NOTE(review): order = c(q, p) mirrors the original code; confirm the
# intended mapping onto tseries::garch's order convention.
model[[cc]] <- garch(arima.res,order=c(q,p),trace=FALSE)
cc <- cc + 1
}
}
GARCH_AIC <- sapply(model, AIC)
# which.min() returns a single index (the first minimum), so ties cannot
# produce the invalid recursive [[ ]] indexing that
# which(GARCH_AIC == min(GARCH_AIC)) would on a tie.
model[[which.min(GARCH_AIC)]]
Call:
garch(x = arima.res, order = c(q, p), trace = F)
Coefficient(s):
a0 a1 a2 b1
0.006728 0.388092 0.280750 0.020268
{r}
# Fit and summarize GARCH(2, 1) on the ARIMA residuals. The data
# argument is named for clarity and TRUE/FALSE is spelled out (F is a
# reassignable alias, not a reserved word).
summary(garchFit(~garch(2,1), data = arima.res, trace = FALSE))
Title:
GARCH Modelling
Call:
garchFit(formula = ~garch(2, 1), data = arima.res, trace = F)
Mean and Variance Equation:
data ~ garch(2, 1)
<environment: 0x130e6cd18>
[data = arima.res]
Conditional Distribution:
norm
Coefficient(s):
mu omega alpha1 alpha2 beta1
-0.0017586 0.0067031 0.3843606 0.2806766 0.0200725
Std. Errors:
based on Hessian
Error Analysis:
Estimate Std. Error t value Pr(>|t|)
mu -0.001759 0.006263 -0.281 0.77887
omega 0.006703 0.001252 5.352 8.68e-08 ***
alpha1 0.384361 0.120258 3.196 0.00139 **
alpha2 0.280677 0.102399 2.741 0.00612 **
beta1 0.020073 0.067667 0.297 0.76674
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Log Likelihood:
210.7997 normalized: 0.783642
Description:
Fri Dec 8 02:25:03 2023 by user:
Standardised Residuals Tests:
Statistic p-Value
Jarque-Bera Test R Chi^2 574.5326941 0.000000e+00
Shapiro-Wilk Test R W 0.9136106 2.422078e-11
Ljung-Box Test R Q(10) 6.9532841 7.298477e-01
Ljung-Box Test R Q(15) 10.7342224 7.711973e-01
Ljung-Box Test R Q(20) 12.1731000 9.099891e-01
Ljung-Box Test R^2 Q(10) 2.2231241 9.943263e-01
Ljung-Box Test R^2 Q(15) 3.0672555 9.995386e-01
Ljung-Box Test R^2 Q(20) 3.7192706 9.999745e-01
LM Arch Test R TR^2 2.2067775 9.990169e-01
Information Criterion Statistics:
AIC BIC SIC HQIC
-1.530109 -1.463293 -1.530784 -1.503276
{r}
# Refit without the non-significant beta1 term: GARCH(2, 0), i.e. a pure
# ARCH(2) variance equation. TRUE/FALSE spelled out; data argument named.
summary(garchFit(~garch(2,0), data = arima.res, trace = FALSE))
Title:
GARCH Modelling
Call:
garchFit(formula = ~garch(2, 0), data = arima.res, trace = F)
Mean and Variance Equation:
data ~ garch(2, 0)
<environment: 0x1374deac8>
[data = arima.res]
Conditional Distribution:
norm
Coefficient(s):
mu omega alpha1 alpha2
-0.0017586 0.0069147 0.3836984 0.2883843
Std. Errors:
based on Hessian
Error Analysis:
Estimate Std. Error t value Pr(>|t|)
mu -0.001759 0.006278 -0.280 0.77938
omega 0.006915 0.001099 6.292 3.14e-10 ***
alpha1 0.383698 0.120256 3.191 0.00142 **
alpha2 0.288384 0.097400 2.961 0.00307 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Log Likelihood:
210.7456 normalized: 0.783441
Description:
Fri Dec 8 02:25:03 2023 by user:
Standardised Residuals Tests:
Statistic p-Value
Jarque-Bera Test R Chi^2 565.2641722 0.000000e+00
Shapiro-Wilk Test R W 0.9143988 2.796152e-11
Ljung-Box Test R Q(10) 7.0370496 7.219427e-01
Ljung-Box Test R Q(15) 10.7816832 7.679114e-01
Ljung-Box Test R Q(20) 12.2495164 9.072201e-01
Ljung-Box Test R^2 Q(10) 2.1211064 9.953248e-01
Ljung-Box Test R^2 Q(15) 2.9819760 9.996126e-01
Ljung-Box Test R^2 Q(20) 3.6504702 9.999782e-01
LM Arch Test R TR^2 2.1341298 9.991709e-01
Information Criterion Statistics:
AIC BIC SIC HQIC
-1.537142 -1.483689 -1.537576 -1.515675
Model Equation
\(Y_{t} = \mu + \varepsilon_{t}, \qquad \varepsilon_{t} = \sigma_{t} z_{t}, \qquad \sigma_{t}^{2} = \omega + \alpha_{1}\,\varepsilon_{t-1}^{2} + \alpha_{2}\,\varepsilon_{t-2}^{2}\)
With the fitted GARCH(2, 0) coefficients: \(\mu = -0.00176,\ \sigma_{t}^{2} = 0.00691 + 0.38370\,\varepsilon_{t-1}^{2} + 0.28838\,\varepsilon_{t-2}^{2}\)
Forecast Plot
Below we can observe the Forecast Plot with the predictions from the ARIMA(0, 1, 2) + GARCH(2, 0) model.
{r}
# Final variance model: GARCH(2, 0) on the ARIMA(0, 1, 2) residuals.
final.fit <- garchFit(~garch(2,0), data = arima.res, trace = FALSE)
# 100-step-ahead forecast using fGarch's built-in plot. Note this rebinds
# the name `plot`, shadowing the earlier ggplot object and base::plot.
plot <- predict(final.fit, n.ahead = 100, plot=TRUE)